#######################################################################

#######################################################################

## (c) Ramya Parthasarathy

## This file pre-processes data from the village assembly transcripts collected as part of an impact evaluation of the Pudhu Vaazhvu Project, a poverty alleviation and livelihoods program implemented by the World Bank and Government of Tamil Nadu. 

## Village assembly proceedings were recorded in a matched sample of 50 treatment and 50 control villages on Republic Day, in January 2014. Recordings were then transcribed into Tamil, and then manually translated into English by a team from our survey firm.

## Transcripts were then cleaned manually, formatted into a nested dictionary to extract relevant meta-data (geographic identifiers, gender and position of the speaker, etc.), and exported to a flat .csv file. See Transcripts_DTM.py and flatten_speeches.py for this code.


#######################################################################

#######################################################################

## Start from an empty global environment
rm(list = ls())

## Install packages (see below)

## Packages attached by this script, in attach order
pkgs <- c(
	"xtable", "sandwich", "dplyr", "gdata", "ggplot2",
	"foreign", "gtools", "lmtest", "reshape2", "lfe",
	"stargazer", "stm", "tm", "SnowballC", "wordcloud"
)

# install.packages(pkgs)

## Attach every package; the printed named logical vector shows which
## ones were loaded successfully (require() returns FALSE on failure)
vapply(pkgs, require, logical(1), character.only = TRUE)
sessionInfo()

#######################################################################

#######################################################################

######################### CLEAN & RECODE DATA  ########################

#######################################################################

#######################################################################

## Load Gram Sabha Transcript Data
setwd("~/Dropbox/wb-pvp/GS_Stata/Transcript_Analysis/")
df <- read.csv(file = "gs_speeches1.csv", header = TRUE)

names(df)

## The first column carries no information, so discard it
df <- df[, -1]

## Clean up gender, project status, position coding
## Gender is keyed on the first character of the raw code:
## "m" -> male, "f" -> female, anything else -> public
df$gender <- local({
	initial <- substr(df$gender, 1, 1)
	ifelse(initial == "m",
		"male",
		ifelse(initial == "f", "female", "public"))
})

table(df$gender)
df$gender <- as.factor(df$gender)
levels(df$gender)

## Raw position codes, inspected before they are collapsed further down
table(df$position)

#######################################################################
#######################################################################
## Drop enumerator speeches
## Remove speeches made by enumerators (including two misspelt codes).
## Rows with a missing position are dropped as well, exactly as the
## original chain of `!=` comparisons inside subset() did.
df <- subset(df, !is.na(position) &
	!(position %in% c("enumerator", "enuerator", "enumeratorreporter")))

## Collapse the raw position codes into four speaker categories.
## Precedence is citizen > official_admin > PVP; every code that is not
## listed below falls through to "official_elected".
citizen_roles <- c(
	"citizen", "children", "waterman", "fairpriceshopowner", "female",
	"male", "retiredofficial", "shg", "st", "sc", "shoprep", "villager",
	"youth", "youthrep"
)

## NB: "waterman" used to be repeated in this list; the repeat could never
## match because the citizen list is tested first, so it was removed.
admin_roles <- c(
	"accountant", "adminofficer", "anganwadi", "angandwadi",
	"assistantblockofficer", "assistantplanmanager", "bdo", "deputybdo",
	"deputycollector", "deputyprimaryeducationofficer",
	"districtsecretary", "blockdevelopmentofficer", "clerk",
	"coopemployee", "headmaster", "headmistress", "newlifeplanofficer",
	"officer", "official", "officials", "projectemployee",
	"projectmanager", "projectsupervisor", "middaymealemployee",
	"nutritiousmealstaff", "pvpofficer", "pvpofficial", "pvpplanofficer",
	"police", "panchayatsecretary", "rationofficer", "rationstaff",
	"secretary", "secretarysubject20", "teacher", "teamleader", "vao",
	"femalepvp", "villageoffer", "pvpstaff", "pvpteamleader"
)

## NOTE(review): "pvpteamleader" was also listed here, but it is claimed by
## the admin list above, so it has always been coded "official_admin".
## That effective behaviour is preserved; confirm it is the intended one.
pvp_roles <- c(
	"pvp", "pvpaccountant", "cst", "vprc", "vprcaccountant",
	"vprcsecreary", "vprcsecretary", "vprcstaff", "tnsrlm", "ptc",
	"plfmember", "sac", "formervprcsecretary", "plf", "plfsecretary"
)

df$position_new <- ifelse(df$position %in% citizen_roles,
	"citizen",
	ifelse(df$position %in% admin_roles,
		"official_admin",
		ifelse(df$position %in% pvp_roles, "PVP", "official_elected")))

table(df$position, df$position_new)
class(df$position_new)

## Fix the level order explicitly. as.factor() would order the levels by
## the collation rules of the current locale, while the code further down
## relies on the integer codes 1 = PVP, 2 = citizen, 3 = official_admin,
## 4 = official_elected.
df$position_new <- factor(df$position_new,
	levels = c("PVP", "citizen", "official_admin", "official_elected"))
table(df$position_new)

## Standardise the project status labels first, so that rows spelt
## "non-PVP" are also recognised as control areas in the fix below
df$projectstatus <- as.character(df$projectstatus)
df$projectstatus[df$projectstatus %in% "non-PVP"] <- "Non-PVP"
df$projectstatus <- as.factor(df$projectstatus)

## Fix position for control areas
	## NB: "PVP" speakers in control areas are from neighbouring villages and come to speak about the program.  They are essentially admins who are introducing the initiative.

## Recode by label rather than by integer factor code, so the result does
## not depend on how the factor levels happen to be ordered or on which
## levels are present. Rows with a missing project status are left as is.
position_label <- as.character(df$position_new)
control_pvp <- !is.na(position_label) & position_label == "PVP" &
	!is.na(df$projectstatus) & df$projectstatus == "Non-PVP"
position_label[control_pvp] <- "official_admin"

df$position_new <- factor(position_label,
	levels = c("PVP", "citizen", "official_admin", "official_elected"))

table(df$position_new, df$projectstatus)

## Speaker-type dummies. Note that `citizen` is 1 for everyone who is not
## an official, i.e. it also covers the "PVP" category.
is_admin <- df$position_new == "official_admin"
is_elected <- df$position_new == "official_elected"

df$citizen <- as.numeric(!(is_admin | is_elected))
table(df$position_new, df$citizen)

df$admin <- as.numeric(is_admin)
table(df$position_new, df$admin)

df$pol <- as.numeric(is_elected)
table(df$position_new, df$pol)

df$pvp <- as.numeric(df$position_new == "PVP")

table(df$projectstatus)



## Clean district names
## Districts: strip trailing blanks and harmonise alternative spellings
df$district <- as.character(df$district)
df$district[df$district %in% "kancheepuram "] <- "kancheepuram"
df$district[df$district %in% c("thiruvallur ", "thiruvalluvar")] <-
	"thiruvallur"
df$district[df$district %in% "villupuram"] <- "viluppuram"


## Clean block names
df$block <- as.character(df$block)
df$block[df$block %in% "erumapatti"] <- "erumaipatti"
df$block[df$block %in% "kuthalam\t"] <- "kuthalam"
df$block[df$block %in% "sembur"] <- "sembanarkoil"


## Clean village names
df$village <- as.character(df$village)
df$village[df$village %in% "mugaiyur\t"] <- "mugaiyur"
df$village[df$village %in% "che. pachal"] <- "pachal"
df$village[df$village %in% "muthugapatti"] <- "muthugapattti"

## Geographic identifiers are stored in upper case
for (geo_col in c("district", "block", "village")) {
	df[[geo_col]] <- toupper(df[[geo_col]])
}
rm(geo_col)

## Calculate speech lengths (raw words)
## Approximate word count of the raw speech: the number of runs of
## non-word characters found by the regular expression, plus one
df$speechlength <- lengths(gregexpr("\\W+", df$raw_content)) + 1

## Speaker gender dummies; `male` is the complement of `female`, so
## speakers coded "public" count as male here
df$female <- as.numeric(df$gender == "female")
df$male <- 1 - df$female

## Reorder columns
## Keep only the analysis columns, in a fixed order.
## "pvp" is retained as well: it is created above and read again when the
## next-speaker indicators are built, so dropping it here broke that step.
keep_cols <- c("district", "block", "village", "projectstatus", # "treat",
	"speech_id", "gender", "female", "male", "position_new", "citizen",
	"admin", "pol", "pvp", "position", "speechlength", "noise",
	"raw_content", "cleaned_content")
df <- df[, keep_cols]


## Rename the geographic identifiers to the names used in the facesheet
## data, so that the two sources can be merged on them
require(gdata)
df <- rename.vars(df, from = c("district", "block", "village"),
	to = c("DIST", "BLOCK", "GP"))


#######################################################################

######################## IDs from FACESHEET DATA  #####################

#######################################################################
setwd("~/Dropbox/wb-pvp/GS_Stata/GS_data/")
require(foreign)

## Extract Geographic Identifiers
## Read the facesheet data (Stata format) and keep the three name
## columns together with the GP identifier
df_fs <- read.dta(file = "merged_facesheets_GP_s12.dta")
names(df_fs)

geo_vars <- c("DIST", "BLOCK", "GP", "GPID_s")
geo_code <- subset(x = df_fs, select = geo_vars)
names(geo_code)

## Check whether names match across data sets

## Compare two sets of names and report, for every name found in either
## source, where it occurs. `match` is 1 when the name is present in both.
## A set-based comparison is used because comparing the two sorted vectors
## position by position recycles the shorter vector and misaligns every
## name after the first discrepancy.
compare_geo_names <- function(fs_names, gs_names) {
	all_names <- sort(union(fs_names, gs_names))
	in_fs <- all_names %in% fs_names
	in_gs <- all_names %in% gs_names
	data.frame(
		name = all_names,
		in_facesheet = in_fs,
		in_transcripts = in_gs,
		match = as.numeric(in_fs & in_gs),
		stringsAsFactors = FALSE
	)
}

## Districts
fs_dist <- sort(unique(as.character(df_fs$DIST)))
gs_dist <- sort(unique(as.character(df$DIST)))

geo_match <- compare_geo_names(fs_dist, gs_dist)
geo_match

## Block
fs_block <- sort(unique(as.character(df_fs$BLOCK)))
gs_block <- sort(unique(as.character(df$BLOCK)))

geo_match <- compare_geo_names(fs_block, gs_block)
geo_match

## GPs
fs_gp <- sort(unique(as.character(df_fs$GP)))
gs_gp <- sort(unique(as.character(df$GP)))

geo_match <- compare_geo_names(fs_gp, gs_gp)
geo_match

## Merge in GS identifiers
## merge() keeps only rows whose (DIST, BLOCK, GP) combination occurs in
## both data sets, and repeats rows if a combination occurs more than once
## in geo_code; warn if either changes the number of speeches.
n_speeches_before <- nrow(df)
df <- merge(df, geo_code, by = c("DIST", "BLOCK", "GP"))
if (nrow(df) != n_speeches_before) {
	warning("Merging geographic identifiers changed the number of speeches from ",
		n_speeches_before, " to ", nrow(df),
		"; check DIST/BLOCK/GP names against the facesheet data.",
		call. = FALSE)
}
names(df)


#####################################################################################################################################################################################################################
## Add in relevant covariates from matching data
#####################################################################################################################################################################################################################

## Facesheet variables used on the right-hand side of the treatment
## regression further down (still under their original survey names)
pscore_vars <- c(
	"No_HH_fin", "SC_prop_fin", "ST_prop_fin",
	"V2B01", "V2B04",
	"V2A02_1", "V2A02_2", "V2A02_3", "V2A02_4", "V2A02_5",
	"bank_dum"
)

## Remaining GP-level covariates, identifiers and project status
vars <- c(
	"sc_pres", "fem_pres", "fem_att", "male_att", "tot_att",
	"female_literacy", "male_literacy", "d_pair_group",
	"total_literacy", "sex_ratio", "pscore2011", "GPID_s",
	"projectstatus", "n_govt", "n_govt_fem", "n_govt_male"
)

covs <- subset(x = df_fs, select = c(vars, pscore_vars))
names(covs)

## Replace the survey question codes with descriptive names
covs <- rename.vars(
	covs,
	from = c("V2B01", "V2B04", "V2A02_1", "V2A02_2",
		"V2A02_3", "V2A02_4", "V2A02_5"),
	to = c("Prim_school", "Sec_school", "Health_Ctr", "Hospital",
		"Clinic", "Med_Shop", "Govt_Hospital")
)
names(covs)


#######################################################################
## LPM for Treatment Status
#######################################################################
## Pscores generated based on demographics and indicators for particular public goods

## Number of HH
## Perc SC
## Perc ST
## Indicator for Primary School
## Indicator for High School
## Facility Indicators
	# Primary Health Centre
	# Private Hospital
	# Private Clinic
	# Medicine Shop
	# Big Govt Hospital
## Indicator for Bank

table(covs$projectstatus)

## Treatment dummy: 1 where the project status is "Project", 0 otherwise
covs$treat <- as.factor(covs$projectstatus)
levels(covs$treat)
covs$treat <- as.numeric(covs$treat == "Project")
table(covs$treat)

## GPs without a pair group are all assigned the group code 15
## NOTE(review): presumably 15 is a code not used by any real pair group;
## confirm against the matching data.
covs$d_pair_group <- ifelse(
	is.na(covs$d_pair_group),
	15,
	covs$d_pair_group
)
table(covs$d_pair_group)

## Cross-tabulations of project status against sc_pres and fem_pres
table(covs$projectstatus)
table(covs$projectstatus, covs$sc_pres)

table(covs$projectstatus, covs$fem_pres)

## Linear probability model of treatment status on the matching
## covariates, with d_pair_group entering as a fixed effect
lpm <- felm(
	treat ~ No_HH_fin + SC_prop_fin + ST_prop_fin +
		Prim_school + Sec_school + Health_Ctr + Hospital + Clinic +
		Med_Shop + Govt_Hospital + bank_dum | d_pair_group | 0 | 0,
	data = covs
)

summary(lpm)

##############################################################################################################################################
## Merge covariates to documents
##############################################################################################################################################

## Attach the GP-level covariates to every speech. Only speeches whose
## GPID_s also occurs in covs are kept. Both data sets contain a
## `projectstatus` column, so merge() suffixes them: `.x` is the value
## from the transcripts (df), `.y` the value from the facesheets (covs).
df <- merge(df, covs, by = "GPID_s")
names(df)

##############################################################################################################################################
# Confirm that treatment status from facesheet = from transcript
table(df$projectstatus.y, df$projectstatus.x)
table(df$treat)

## Fix error for Eachur. Incorrect on original transcript.  
## NOTE(review): projectstatus.x is converted to a factor earlier in this
## script, and ifelse() returns the integer codes of a factor rather than
## its labels. After this line the column therefore holds numbers, and the
## value 1 written for GPID_s 100201 corresponds to the first factor level.
## Confirm that this is the status intended for this GP.
df$projectstatus.x <- ifelse(df$GPID_s==100201, 1, df$projectstatus.x)

## NB: I have since fixed the transcript but haven't re-processed the text data.  As such, I'm just manually fixing it here.


#######################################################################
#######################################################################
## Missing literacy data for
## GP identifiers with no female / male / total literacy figure
covs$GPID_s[is.na(covs$female_literacy)]
covs$GPID_s[is.na(covs$male_literacy)]
covs$GPID_s[is.na(covs$total_literacy)]

## Pair group recorded for each speech from two specific GPs
df$d_pair_group[df$GPID_s == "020104"]
df$d_pair_group[df$GPID_s == "020110"]


#######################################################################
#######################################################################
## Generate indicators for the gender and position of the speakers
## immediately before and after each speech

## Reorder speeches
names(df)
df$speech_id <- as.character(df$speech_id)
## Characters 7-10 of the speech id are parsed as the sequence number of
## the speech and used to order speeches within each GP
df$speech_order <- as.numeric(substr(df$speech_id, 7, 10))
df$speech_order
df <- df[order(df$GPID_s, df$speech_order), ]

## GP list
GPs <- unique(df$GPID_s)

## The position indicators below are stored as integer factor codes, so
## position_new has to be a factor at this point
stopifnot(is.factor(df$position_new))

## Labels for the next-speaker position: code 0 ("NA") marks the last
## speech of an assembly, codes 1, 2, ... follow the levels of position_new
next_pos_labels <- c("NA", levels(df$position_new))

## For every speech, look one row ahead / behind within the same GP.
##  * first_speech / last_speech flag the speeches with the lowest /
##    highest sequence number in the GP
##  * next_* is 0 for the last speech, prev_* is 0 for the first speech
##  * next_pos / prev_pos hold the integer code of position_new
##  * next_pvp is 1 when the *following* speaker is in the "PVP" category.
##    It is derived from position_new directly, so it does not depend on a
##    separate `pvp` column being present (the previous loop read the
##    preceding speaker's `pvp` value here by mistake).
## lead()/lag() replace the former row-by-row loop; because they are
## evaluated per GP they never reach into a neighbouring assembly.
df_new <- df %>%
	group_by(GPID_s) %>%
	mutate(
		last_speech = ifelse(speech_order == max(speech_order), 1, 0),
		first_speech = ifelse(speech_order == min(speech_order), 1, 0),
		next_fem = ifelse(last_speech == 1, 0, dplyr::lead(female)),
		prev_fem = ifelse(first_speech == 1, 0, dplyr::lag(female)),
		next_pos = ifelse(last_speech == 1, 0,
			dplyr::lead(as.integer(position_new))),
		prev_pos = ifelse(first_speech == 1, 0,
			dplyr::lag(as.integer(position_new))),
		next_pvp = ifelse(last_speech == 1, 0,
			dplyr::lead(as.numeric(position_new == "PVP")))
	) %>%
	ungroup()	# return an ungrouped table so later steps are not run per GP



## Turn the next-speaker codes into a labelled factor. Levels and labels
## are given explicitly, so the labelling stays correct even when one of
## the codes does not occur in the data.
table(df_new$next_pos)
df_new$next_pos <- factor(df_new$next_pos,
	levels = seq_along(next_pos_labels) - 1,
	labels = next_pos_labels)
table(df_new$next_pos)

## Indicators for the next speaker being an official (of either kind),
## an administrative official, or an elected official
next_is_admin <- df_new$next_pos == "official_admin"
next_is_elected <- df_new$next_pos == "official_elected"

df_new$official_response <- as.numeric(next_is_admin | next_is_elected)

df_new$admin_response <- as.numeric(next_is_admin)
df_new$elected_response <- as.numeric(next_is_elected)

#######################################################################
#######################################################################

## Save final data set
setwd("/Users/ramyaparthasarathy/Dropbox/Women_Delib/Data")

## Speech-level data with the adjacent-speaker indicators
save(list = "df_new", file = "GS_data2.Rda")

## GP-level covariates
save(list = "covs", file = "GS_covs.Rda")
